{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6789eab6-9237-41f7-8323-9f0cd0d6e3d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============GPU================\n",
      "Thu May  9 17:28:35 2024       \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |\n",
      "|-----------------------------------------+----------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                      |               MIG M. |\n",
      "|=========================================+======================+======================|\n",
      "|   0  NVIDIA GeForce RTX 4090        On  | 00000000:06:10.0 Off |                  Off |\n",
      "|  0%   36C    P8              11W / 450W |      1MiB / 24564MiB |      0%      Default |\n",
      "|                                         |                      |                  N/A |\n",
      "+-----------------------------------------+----------------------+----------------------+\n",
      "|   1  NVIDIA GeForce RTX 4090        On  | 00000000:06:11.0 Off |                  Off |\n",
      "|  0%   36C    P8               4W / 450W |      1MiB / 24564MiB |      0%      Default |\n",
      "|                                         |                      |                  N/A |\n",
      "+-----------------------------------------+----------------------+----------------------+\n",
      "                                                                                         \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                            |\n",
      "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
      "|        ID   ID                                                             Usage      |\n",
      "|=======================================================================================|\n",
      "|  No running processes found                                                           |\n",
      "+---------------------------------------------------------------------------------------+\n",
      "============CUDA version================\n",
      "nvcc: NVIDIA (R) Cuda compiler driver\n",
      "Copyright (c) 2005-2023 NVIDIA Corporation\n",
      "Built on Mon_Apr__3_17:16:06_PDT_2023\n",
      "Cuda compilation tools, release 12.1, V12.1.105\n",
      "Build cuda_12.1.r12.1/compiler.32688072_0\n",
      "============CPU================\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "model name\t: Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz\n",
      "============Memory================\n",
      "MemTotal:       257704636 kB\n"
     ]
    }
   ],
   "source": [
    "# GPU: driver version and per-device status\n",
    "print(\"============GPU================\")\n",
    "!nvidia-smi\n",
    "\n",
    "# CUDA toolkit (compiler) version\n",
    "print(\"============CUDA version================\")\n",
    "!nvcc --version\n",
    "\n",
    "# CPU model name, as listed in /proc/cpuinfo\n",
    "print(\"============CPU================\")\n",
    "!grep 'model name' /proc/cpuinfo\n",
    "\n",
    "# Total system memory\n",
    "print(\"============Memory================\")\n",
    "!grep MemTotal /proc/meminfo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b9583f02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'llama.cpp'...\n",
      "remote: Enumerating objects: 24130, done.\u001b[K\n",
      "remote: Counting objects: 100% (9002/9002), done.\u001b[K\n",
      "remote: Compressing objects: 100% (534/534), done.\u001b[K\n",
      "remote: Total 24130 (delta 8724), reused 8526 (delta 8467), pack-reused 15128\u001b[K\n",
      "Receiving objects: 100% (24130/24130), 45.90 MiB | 29.09 MiB/s, done.\n",
      "Resolving deltas: 100% (17143/17143), done.\n",
      "/workspace/llama.cpp\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
      "  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
     ]
    }
   ],
   "source": [
    "# Fetch the llama.cpp sources (only needed on first use), then work from inside the checkout\n",
    "!git clone https://github.com/ggerganov/llama.cpp llama.cpp\n",
    "%cd llama.cpp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4f651103-a6c8-4b7d-9f0f-be0553b22acf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml-vocab-aquila.gguf\t\t    ggml-vocab-llama-bpe.gguf\n",
      "ggml-vocab-baichuan.gguf\t    ggml-vocab-llama-bpe.gguf.inp\n",
      "ggml-vocab-bert-bge.gguf\t    ggml-vocab-llama-bpe.gguf.out\n",
      "ggml-vocab-bert-bge.gguf.inp\t    ggml-vocab-llama-spm.gguf\n",
      "ggml-vocab-bert-bge.gguf.out\t    ggml-vocab-llama-spm.gguf.inp\n",
      "ggml-vocab-command-r.gguf\t    ggml-vocab-llama-spm.gguf.out\n",
      "ggml-vocab-command-r.gguf.inp\t    ggml-vocab-mpt.gguf\n",
      "ggml-vocab-command-r.gguf.out\t    ggml-vocab-mpt.gguf.inp\n",
      "ggml-vocab-deepseek-coder.gguf\t    ggml-vocab-mpt.gguf.out\n",
      "ggml-vocab-deepseek-coder.gguf.inp  ggml-vocab-phi-3.gguf\n",
      "ggml-vocab-deepseek-coder.gguf.out  ggml-vocab-phi-3.gguf.inp\n",
      "ggml-vocab-deepseek-llm.gguf\t    ggml-vocab-phi-3.gguf.out\n",
      "ggml-vocab-deepseek-llm.gguf.inp    ggml-vocab-qwen2.gguf\n",
      "ggml-vocab-deepseek-llm.gguf.out    ggml-vocab-qwen2.gguf.inp\n",
      "ggml-vocab-falcon.gguf\t\t    ggml-vocab-qwen2.gguf.out\n",
      "ggml-vocab-falcon.gguf.inp\t    ggml-vocab-refact.gguf\n",
      "ggml-vocab-falcon.gguf.out\t    ggml-vocab-refact.gguf.inp\n",
      "ggml-vocab-gpt-2.gguf\t\t    ggml-vocab-refact.gguf.out\n",
      "ggml-vocab-gpt-2.gguf.inp\t    ggml-vocab-stablelm.gguf\n",
      "ggml-vocab-gpt-2.gguf.out\t    ggml-vocab-starcoder.gguf\n",
      "ggml-vocab-gpt-neox.gguf\t    ggml-vocab-starcoder.gguf.inp\n",
      "ggml-vocab-gpt2.gguf\t\t    ggml-vocab-starcoder.gguf.out\n"
     ]
    }
   ],
   "source": [
    "# The original LLaMA model weights must be obtained and placed in ./models;\n",
    "# show what that directory currently contains\n",
    "!ls models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "58f434b3-1493-448d-95d3-8954c1aa1267",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]\n",
      "Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]\n",
      "Get:3 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]3m\n",
      "Get:4 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB][33m       \n",
      "Get:5 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1789 kB]\n",
      "Get:6 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [44.7 kB]\n",
      "Get:7 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1078 kB]\n",
      "Get:8 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2308 kB]\n",
      "Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [830 kB]\n",
      "Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [27.8 kB]\n",
      "Get:11 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]       \u001b[0m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\n",
      "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]\n",
      "Get:13 http://archive.ubuntu.com/ubuntu jammy/multiverse amd64 Packages [266 kB]\n",
      "Get:14 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]\n",
      "Get:15 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB]\n",
      "Get:16 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB]\n",
      "Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [51.1 kB]\n",
      "Get:18 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [2382 kB]\n",
      "Get:19 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2069 kB]\n",
      "Get:20 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1373 kB]\n",
      "Get:21 http://archive.ubuntu.com/ubuntu jammy-backports/universe amd64 Packages [31.9 kB]\n",
      "Get:22 http://archive.ubuntu.com/ubuntu jammy-backports/main amd64 Packages [81.0 kB]\n",
      "Fetched 32.4 MB in 3s (12.2 MB/s)[33m                        \u001b[0m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "56 packages can be upgraded. Run 'apt list --upgradable' to see them.\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "The following NEW packages will be installed:\n",
      "  git-lfs\n",
      "0 upgraded, 1 newly installed, 0 to remove and 56 not upgraded.\n",
      "Need to get 3503 kB of archives.\n",
      "After this operation, 10.4 MB of additional disk space will be used.\n",
      "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 git-lfs amd64 3.0.2-1ubuntu0.2 [3503 kB]\n",
      "Fetched 3503 kB in 0s (31.3 MB/s)\u001b[0m\u001b[33m\n",
      "debconf: delaying package configuration, since apt-utils is not installed\n",
      "\n",
      "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n",
      "(Reading database ... 21038 files and directories currently installed.)\n",
      "Preparing to unpack .../git-lfs_3.0.2-1ubuntu0.2_amd64.deb ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [  0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (3.0.2-1ubuntu0.2) ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (3.0.2-1ubuntu0.2) ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8\n",
      "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
     ]
    }
   ],
   "source": [
    "# Install git-lfs, which is needed to fetch the large model files when cloning\n",
    "# -y pre-answers apt's confirmation prompt so the install never waits for interactive input\n",
    "!apt update\n",
    "!apt install -y git-lfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4e63016a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/llama.cpp/models\n",
      "Updated git hooks.\n",
      "Git LFS initialized.\n",
      "Cloning into 'Llama-3-8B-GGUF'...\n",
      "remote: Enumerating objects: 44, done.\u001b[K\n",
      "remote: Counting objects: 100% (41/41), done.\u001b[K\n",
      "remote: Compressing objects: 100% (41/41), done.\u001b[K\n",
      "remote: Total 44 (delta 9), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
      "Unpacking objects: 100% (44/44), 2.25 MiB | 4.25 MiB/s, done.\n",
      "Filtering content: 100% (8/8), 9.46 GiB | 8.68 MiB/s, done.\n",
      "Encountered 6 file(s) that may not have been copied correctly on Windows:\n",
      "\tggml-model-Q4_K_M.gguf\n",
      "\tMeta-Llama-3-8B/model-00003-of-00004.safetensors\n",
      "\tMeta-Llama-3-8B/model-00001-of-00004.safetensors\n",
      "\tMeta-Llama-3-8B/original/consolidated.00.pth\n",
      "\tggml-model-f16.gguf\n",
      "\tMeta-Llama-3-8B/model-00002-of-00004.safetensors\n",
      "\n",
      "See: `git lfs help smudge` for more details.\n",
      "Cloning into 'Llama-3-70B-GGUF'...\n",
      "remote: Enumerating objects: 67, done.\u001b[K\n",
      "remote: Counting objects: 100% (64/64), done.\u001b[K\n",
      "remote: Compressing objects: 100% (64/64), done.\u001b[K\n",
      "remote: Total 67 (delta 4), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
      "Unpacking objects: 100% (67/67), 2.26 MiB | 4.11 MiB/s, done.\n",
      "Filtering content: 100% (40/40), 22.45 GiB | 11.89 MiB/s, done.\n",
      "Encountered 38 file(s) that may not have been copied correctly on Windows:\n",
      "\tMeta-Llama-3-70B/original/consolidated.06.pth\n",
      "\tMeta-Llama-3-70B/original/consolidated.04.pth\n",
      "\tMeta-Llama-3-70B/original/consolidated.05.pth\n",
      "\tMeta-Llama-3-70B/original/consolidated.02.pth\n",
      "\tMeta-Llama-3-70B/original/consolidated.07.pth\n",
      "\tMeta-Llama-3-70B/original/consolidated.00.pth\n",
      "\tMeta-Llama-3-70B/original/consolidated.03.pth\n",
      "\tMeta-Llama-3-70B/model-00023-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00028-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00013-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00008-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00018-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00003-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00019-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00029-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00009-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00014-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00004-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00024-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00006-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00022-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00017-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00007-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00026-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00027-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/original/consolidated.01.pth\n",
      "\tMeta-Llama-3-70B/model-00012-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00011-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00016-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00021-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00002-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00025-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00020-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00015-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00010-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00005-of-00030.safetensors\n",
      "\tMeta-Llama-3-70B/model-00001-of-00030.safetensors\n",
      "\tggml-model-Q4_K_M.gguf\n",
      "\n",
      "See: `git lfs help smudge` for more details.\n"
     ]
    }
   ],
   "source": [
    "# Download the model files into ./models. The official weights are published in the Meta HF org\n",
    "# (https://huggingface.co/meta-llama); the two repos cloned below are the author's own HF repos.\n",
    "%cd models\n",
    "!git lfs install\n",
    "!git clone https://huggingface.co/JaaackXD/Llama-3-8B-GGUF Llama-3-8B-GGUF\n",
    "!git clone https://huggingface.co/JaaackXD/Llama-3-70B-GGUF Llama-3-70B-GGUF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "603a64eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/llama.cpp\n"
     ]
    }
   ],
   "source": [
    "# Collect the downloaded .gguf files into one folder per model size, then go back up one level.\n",
    "# Relies on the working directory still being ./models, as left by the download cell's %cd.\n",
    "# mkdir -p tolerates folders that already exist, so a re-run does not error on that step.\n",
    "!mkdir -p 8B-v3\n",
    "!mv Llama-3-8B-GGUF/*.gguf 8B-v3\n",
    "!mkdir -p 70B-v3\n",
    "!mv Llama-3-70B-GGUF/*.gguf 70B-v3\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "43e9d80e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I ccache not found. Consider installing it for faster compilation.\n",
      "I llama.cpp build info: \n",
      "I UNAME_S:   Linux\n",
      "I UNAME_P:   x86_64\n",
      "I UNAME_M:   x86_64\n",
      "I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion \n",
      "I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE \n",
      "I NVCCFLAGS: -std=c++11 -O3 \n",
      "I LDFLAGS:    \n",
      "I CC:        cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I CXX:       c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "\n",
      "rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o tests/test-autorelease tests/test-backend-ops tests/test-double-float tests/test-grad0 tests/test-grammar-integration tests/test-grammar-parser tests/test-json-schema-to-grammar tests/test-llama-grammar tests/test-model-load-cancel tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-rope tests/test-sampling tests/test-tokenizer-0 tests/test-tokenizer-1-bpe tests/test-tokenizer-1-spm\n",
      "removed 'build-info.o'\n",
      "removed 'common.o'\n",
      "removed 'console.o'\n",
      "removed 'ggml-alloc.o'\n",
      "removed 'ggml-backend.o'\n",
      "removed 'ggml-cuda.o'\n",
      "removed 'ggml-quants.o'\n",
      "removed 'ggml.o'\n",
      "removed 'grammar-parser.o'\n",
      "removed 'json-schema-to-grammar.o'\n",
      "removed 'llama.o'\n",
      "removed 'ngram-cache.o'\n",
      "removed 'sampling.o'\n",
      "removed 'sgemm.o'\n",
      "removed 'train.o'\n",
      "removed 'unicode-data.o'\n",
      "removed 'unicode.o'\n",
      "removed 'tests/test-c.o'\n",
      "removed 'libllava.a'\n",
      "removed 'benchmark-matmult'\n",
      "removed 'lookup-create'\n",
      "removed 'lookup-merge'\n",
      "removed 'lookup-stats'\n",
      "removed 'common/build-info.cpp'\n",
      "removed 'main'\n",
      "removed 'quantize'\n",
      "removed 'quantize-stats'\n",
      "removed 'perplexity'\n",
      "removed 'imatrix'\n",
      "removed 'embedding'\n",
      "removed 'vdot'\n",
      "removed 'q8dot'\n",
      "removed 'train-text-from-scratch'\n",
      "removed 'convert-llama2c-to-ggml'\n",
      "removed 'simple'\n",
      "removed 'batched'\n",
      "removed 'batched-bench'\n",
      "removed 'save-load-state'\n",
      "removed 'server'\n",
      "removed 'gguf'\n",
      "removed 'gguf-split'\n",
      "removed 'eval-callback'\n",
      "removed 'llama-bench'\n",
      "removed 'llava-cli'\n",
      "removed 'baby-llama'\n",
      "removed 'beam-search'\n",
      "removed 'retrieval'\n",
      "removed 'speculative'\n",
      "removed 'infill'\n",
      "removed 'tokenize'\n",
      "removed 'parallel'\n",
      "removed 'finetune'\n",
      "removed 'export-lora'\n",
      "removed 'lookahead'\n",
      "removed 'lookup'\n",
      "removed 'passkey'\n",
      "removed 'gritlm'\n",
      "rm -vrf ggml-cuda/*.o\n",
      "removed 'ggml-cuda/acc.o'\n",
      "removed 'ggml-cuda/alibi.o'\n",
      "removed 'ggml-cuda/arange.o'\n",
      "removed 'ggml-cuda/argsort.o'\n",
      "removed 'ggml-cuda/binbcast.o'\n",
      "removed 'ggml-cuda/clamp.o'\n",
      "removed 'ggml-cuda/concat.o'\n",
      "removed 'ggml-cuda/convert.o'\n",
      "removed 'ggml-cuda/cpy.o'\n",
      "removed 'ggml-cuda/diagmask.o'\n",
      "removed 'ggml-cuda/dmmv.o'\n",
      "removed 'ggml-cuda/fattn.o'\n",
      "removed 'ggml-cuda/getrows.o'\n",
      "removed 'ggml-cuda/im2col.o'\n",
      "removed 'ggml-cuda/mmq.o'\n",
      "removed 'ggml-cuda/mmvq.o'\n",
      "removed 'ggml-cuda/norm.o'\n",
      "removed 'ggml-cuda/pad.o'\n",
      "removed 'ggml-cuda/pool2d.o'\n",
      "removed 'ggml-cuda/quantize.o'\n",
      "removed 'ggml-cuda/rope.o'\n",
      "removed 'ggml-cuda/scale.o'\n",
      "removed 'ggml-cuda/softmax.o'\n",
      "removed 'ggml-cuda/sumrows.o'\n",
      "removed 'ggml-cuda/tsembd.o'\n",
      "removed 'ggml-cuda/unary.o'\n",
      "removed 'ggml-cuda/upscale.o'\n",
      "find examples pocs -type f -name \"*.o\" -delete\n",
      "I ccache not found. Consider installing it for faster compilation.\n",
      "I llama.cpp build info: \n",
      "I UNAME_S:   Linux\n",
      "I UNAME_P:   x86_64\n",
      "I UNAME_M:   x86_64\n",
      "I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion \n",
      "I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS \n",
      "I NVCCFLAGS: -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \n",
      "I LDFLAGS:   -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "I CC:        cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I CXX:       c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I NVCC:      Build cuda_12.1.r12.1/compiler.32688072_0\n",
      "\n",
      "!!!!\n",
      "LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.\n",
      "!!!!\n",
      "\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml.c -o ggml.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c llama.cpp -o llama.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/common.cpp -o common.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/sampling.cpp -o sampling.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/grammar-parser.cpp -o grammar-parser.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/json-schema-to-grammar.cpp -o json-schema-to-grammar.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/console.cpp -o console.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c sgemm.cpp -o sgemm.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda.cu -o ggml-cuda.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/acc.cu -o ggml-cuda/acc.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/alibi.cu -o ggml-cuda/alibi.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/arange.cu -o ggml-cuda/arange.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/argsort.cu -o ggml-cuda/argsort.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/binbcast.cu -o ggml-cuda/binbcast.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/clamp.cu -o ggml-cuda/clamp.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/concat.cu -o ggml-cuda/concat.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/convert.cu -o ggml-cuda/convert.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/cpy.cu -o ggml-cuda/cpy.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/diagmask.cu -o ggml-cuda/diagmask.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/dmmv.cu -o ggml-cuda/dmmv.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/fattn.cu -o ggml-cuda/fattn.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/getrows.cu -o ggml-cuda/getrows.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/im2col.cu -o ggml-cuda/im2col.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/mmq.cu -o ggml-cuda/mmq.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/mmvq.cu -o ggml-cuda/mmvq.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/norm.cu -o ggml-cuda/norm.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/pad.cu -o ggml-cuda/pad.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/pool2d.cu -o ggml-cuda/pool2d.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/quantize.cu -o ggml-cuda/quantize.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/rope.cu -o ggml-cuda/rope.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/scale.cu -o ggml-cuda/scale.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/softmax.cu -o ggml-cuda/softmax.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/sumrows.cu -o ggml-cuda/sumrows.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/tsembd.cu -o ggml-cuda/tsembd.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/unary.cu -o ggml-cuda/unary.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/upscale.cu -o ggml-cuda/upscale.o\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml-alloc.c -o ggml-alloc.o\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml-backend.c -o ggml-backend.o\n",
      "cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion     -c ggml-quants.c -o ggml-quants.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c unicode.cpp -o unicode.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c unicode-data.cpp -o unicode-data.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/train.cpp -o train.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/ngram-cache.cpp -o ngram-cache.o\n",
      "cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion  -c tests/test-c.c -o tests/test-c.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/build-info.cpp -o build-info.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/main/main.cpp -o examples/main/main.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/quantize/quantize.cpp -o examples/quantize/quantize.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/quantize-stats/quantize-stats.cpp -o examples/quantize-stats/quantize-stats.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/perplexity/perplexity.cpp -o examples/perplexity/perplexity.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/imatrix/imatrix.cpp -o examples/imatrix/imatrix.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/embedding/embedding.cpp -o examples/embedding/embedding.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c pocs/vdot/vdot.cpp -o pocs/vdot/vdot.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c pocs/vdot/q8dot.cpp -o pocs/vdot/q8dot.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/train-text-from-scratch/train-text-from-scratch.cpp -o examples/train-text-from-scratch/train-text-from-scratch.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp -o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/simple/simple.cpp -o examples/simple/simple.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/batched/batched.cpp -o examples/batched/batched.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/batched-bench/batched-bench.cpp -o examples/batched-bench/batched-bench.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/save-load-state/save-load-state.cpp -o examples/save-load-state/save-load-state.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/server/server.cpp -o examples/server/server.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gguf/gguf.cpp -o examples/gguf/gguf.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gguf-split/gguf-split.cpp -o examples/gguf-split/gguf-split.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/eval-callback/eval-callback.cpp -o examples/eval-callback/eval-callback.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llama-bench/llama-bench.cpp -o examples/llama-bench/llama-bench.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -static -fPIC -c examples/llava/llava.cpp -o libllava.a -Wno-cast-qual\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/llava-cli.cpp -o examples/llava/llava-cli.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/baby-llama/baby-llama.cpp -o examples/baby-llama/baby-llama.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/beam-search/beam-search.cpp -o examples/beam-search/beam-search.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/retrieval/retrieval.cpp -o examples/retrieval/retrieval.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/speculative/speculative.cpp -o examples/speculative/speculative.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/infill/infill.cpp -o examples/infill/infill.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/tokenize/tokenize.cpp -o examples/tokenize/tokenize.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/benchmark/benchmark-matmult.cpp -o examples/benchmark/benchmark-matmult.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/parallel/parallel.cpp -o examples/parallel/parallel.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/finetune/finetune.cpp -o examples/finetune/finetune.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/export-lora/export-lora.cpp -o examples/export-lora/export-lora.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookahead/lookahead.cpp -o examples/lookahead/lookahead.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup.cpp -o examples/lookup/lookup.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/passkey/passkey.cpp -o examples/passkey/passkey.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gritlm/gritlm.cpp -o examples/gritlm/gritlm.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gguf/gguf.o -o gguf -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o pocs/vdot/q8dot.o -o q8dot -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o pocs/vdot/vdot.o -o vdot -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/tokenize/tokenize.o -o tokenize -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/baby-llama/baby-llama.o -o baby-llama -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/eval-callback/eval-callback.o -o eval-callback -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/benchmark/benchmark-matmult.o -o benchmark-matmult -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/beam-search/beam-search.o -o beam-search -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/save-load-state/save-load-state.o -o save-load-state -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gguf-split/gguf-split.o -o gguf-split -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/embedding/embedding.o -o embedding -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/simple/simple.o -o simple -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/batched/batched.o -o batched -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o llama.o common.o sampling.o grammar-parser.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/batched-bench/batched-bench.o -o batched-bench -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/passkey/passkey.o -o passkey -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gritlm/gritlm.o -o gritlm -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/export-lora/export-lora.o -o export-lora -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/clip.cpp  -o examples/llava/clip.o -Wno-cast-qual\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup.o -o lookup -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/train-text-from-scratch/train-text-from-scratch.o -o train-text-from-scratch -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookahead/lookahead.o -o lookahead -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/parallel/parallel.o -o parallel -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/quantize/quantize.o -o quantize -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-create.cpp -o examples/lookup/lookup-create.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o -o convert-llama2c-to-ggml -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/retrieval/retrieval.o -o retrieval -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/imatrix/imatrix.o -o imatrix -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/finetune/finetune.o -o finetune -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/speculative/speculative.o -o speculative -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o console.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/infill/infill.o -o infill -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-create.o -o lookup-create -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-merge.cpp -o examples/lookup/lookup-merge.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o console.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/main/main.o -o main -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "\n",
      "====  Run ./main -h for help.  ====\n",
      "\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-merge.o -o lookup-merge -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/perplexity/perplexity.o -o perplexity -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-stats.cpp -o examples/lookup/lookup-stats.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o llama.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/quantize-stats/quantize-stats.o -o quantize-stats -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-stats.o -o lookup-stats -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/llama-bench/llama-bench.o -o llama-bench -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/llava.cpp -o examples/llava/llava.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/llava/llava-cli.o examples/llava/clip.o examples/llava/llava.o -o llava-cli -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o -Iexamples/server examples/server/server.o -o server -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib  \n"
     ]
    }
   ],
   "source": [
    "# Build\n",
    "!make clean && LLAMA_CUBLAS=1 make -j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3ca0e26e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting numpy~=1.24.4 (from -r ./requirements/requirements-convert.txt (line 1))\n",
      "  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
      "Collecting sentencepiece~=0.2.0 (from -r ./requirements/requirements-convert.txt (line 2))\n",
      "  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n",
      "Collecting transformers<5.0.0,>=4.40.1 (from -r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting gguf>=0.1.0 (from -r ./requirements/requirements-convert.txt (line 4))\n",
      "  Downloading gguf-0.6.0-py3-none-any.whl.metadata (3.2 kB)\n",
      "Collecting protobuf<5.0.0,>=4.21.0 (from -r ./requirements/requirements-convert.txt (line 5))\n",
      "  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)\n",
      "Collecting torch~=2.1.1 (from -r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.13.1)\n",
      "Collecting huggingface-hub<1.0,>=0.19.3 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (23.2)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (6.0.1)\n",
      "Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2.31.0)\n",
      "Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
      "Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
      "Collecting tqdm>=4.27 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (4.9.0)\n",
      "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (1.12)\n",
      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (3.2.1)\n",
      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (3.1.3)\n",
      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (2024.2.0)\n",
      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (8.9.2.26)\n",
      "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.3.1)\n",
      "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (11.0.2.54)\n",
      "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (10.3.2.106)\n",
      "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (11.4.5.107)\n",
      "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.0.106)\n",
      "Collecting nvidia-nccl-cu12==2.18.1 (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)\n",
      "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Collecting triton==2.1.0 (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)\n",
      "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.3.101)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (2.1.5)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.6)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2.2.0)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2024.2.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (1.3.0)\n",
      "Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m68.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m74.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading transformers-4.40.2-py3-none-any.whl (9.0 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.0/9.0 MB\u001b[0m \u001b[31m121.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading gguf-0.6.0-py3-none-any.whl (23 kB)\n",
      "Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.6/294.6 kB\u001b[0m \u001b[31m42.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m670.2/670.2 MB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl (209.8 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.8/209.8 MB\u001b[0m \u001b[31m28.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.2/89.2 MB\u001b[0m \u001b[31m52.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.2/401.2 kB\u001b[0m \u001b[31m45.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (774 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.1/774.1 kB\u001b[0m \u001b[31m63.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m92.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m134.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.3/78.3 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: sentencepiece, triton, tqdm, safetensors, regex, protobuf, nvidia-nccl-cu12, numpy, huggingface-hub, gguf, torch, tokenizers, transformers\n",
      "  Attempting uninstall: triton\n",
      "    Found existing installation: triton 2.2.0\n",
      "    Uninstalling triton-2.2.0:\n",
      "      Successfully uninstalled triton-2.2.0\n",
      "  Attempting uninstall: nvidia-nccl-cu12\n",
      "    Found existing installation: nvidia-nccl-cu12 2.19.3\n",
      "    Uninstalling nvidia-nccl-cu12-2.19.3:\n",
      "      Successfully uninstalled nvidia-nccl-cu12-2.19.3\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 1.26.3\n",
      "    Uninstalling numpy-1.26.3:\n",
      "      Successfully uninstalled numpy-1.26.3\n",
      "  Attempting uninstall: torch\n",
      "    Found existing installation: torch 2.2.0\n",
      "    Uninstalling torch-2.2.0:\n",
      "      Successfully uninstalled torch-2.2.0\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "torchaudio 2.2.0 requires torch==2.2.0, but you have torch 2.1.2 which is incompatible.\n",
      "torchvision 0.17.0 requires torch==2.2.0, but you have torch 2.1.2 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0mSuccessfully installed gguf-0.6.0 huggingface-hub-0.23.0 numpy-1.24.4 nvidia-nccl-cu12-2.18.1 protobuf-4.25.3 regex-2024.4.28 safetensors-0.4.3 sentencepiece-0.2.0 tokenizers-0.19.1 torch-2.1.2 tqdm-4.66.4 transformers-4.40.2 triton-2.1.0\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mINFO:hf-to-gguf:Loading model: Meta-Llama-3-70B\n",
      "INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only\n",
      "INFO:hf-to-gguf:Set model parameters\n",
      "INFO:hf-to-gguf:gguf: context length = 8192\n",
      "INFO:hf-to-gguf:gguf: embedding length = 8192\n",
      "INFO:hf-to-gguf:gguf: feed forward length = 28672\n",
      "INFO:hf-to-gguf:gguf: head count = 64\n",
      "INFO:hf-to-gguf:gguf: key-value head count = 8\n",
      "INFO:hf-to-gguf:gguf: rope theta = 500000.0\n",
      "INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05\n",
      "INFO:hf-to-gguf:gguf: file type = 1\n",
      "INFO:hf-to-gguf:Set model tokenizer\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "INFO:gguf.vocab:Adding 280147 merge(s).\n",
      "INFO:gguf.vocab:Setting special token type bos to 128000\n",
      "INFO:gguf.vocab:Setting special token type eos to 128001\n",
      "INFO:hf-to-gguf:Exporting model to 'models/70B-v3/ggml-model-f16.gguf'\n",
      "INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> float16, shape = {8192, 128256}\n",
      "INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.0.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.1.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.1.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.1.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.1.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.1.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00002-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.1.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.1.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.1.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.1.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.2.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.2.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.2.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.2.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.2.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.2.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.2.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.2.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.2.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.3.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.3.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.3.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.3.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.3.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.3.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.3.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.3.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.3.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.4.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.4.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.4.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.4.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00003-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.4.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.4.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.4.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.4.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.4.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.5.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.5.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.5.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.5.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.5.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.5.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.5.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.5.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.5.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.6.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.6.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.6.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.6.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.6.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.6.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.6.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.6.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.6.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.7.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.7.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.7.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00004-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.7.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.7.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.7.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.7.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.7.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.7.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.8.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.8.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.8.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.8.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.8.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.8.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.8.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.8.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.8.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.9.attn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.9.ffn_down.weight,       torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.9.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.9.ffn_up.weight,         torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.9.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.9.attn_k.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.9.attn_output.weight,    torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.9.attn_q.weight,         torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.9.attn_v.weight,         torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00005-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.10.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.10.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.10.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.10.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.10.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.10.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.10.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.10.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.10.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.11.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.11.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.11.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.11.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.11.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.11.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.11.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.11.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.11.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.12.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.12.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.12.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.12.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.12.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.12.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00006-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.12.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.12.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.12.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.13.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.13.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.13.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.13.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.13.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.13.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.13.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.13.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.13.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.14.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.14.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.14.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.14.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.14.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.14.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.14.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.14.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.14.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.15.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.15.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.15.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.15.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.15.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00007-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.15.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.15.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.15.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.15.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.16.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.16.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.16.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.16.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.16.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.16.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.16.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.16.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.16.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.17.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.17.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.17.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.17.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.17.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.17.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.17.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.17.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.17.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.18.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.18.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.18.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.18.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00008-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.18.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.18.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.18.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.18.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.18.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.19.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.19.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.19.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.19.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.19.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.19.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.19.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.19.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.19.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.20.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.20.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.20.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.20.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.20.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.20.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.20.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.20.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.20.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.21.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.21.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.21.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00009-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.21.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.21.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.21.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.21.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.21.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.21.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.22.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.22.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.22.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.22.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.22.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.22.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.22.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.22.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.22.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.23.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.23.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.23.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.23.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.23.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.23.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.23.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.23.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.23.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00010-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.24.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.24.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.24.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.24.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.24.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.24.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.24.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.24.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.24.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.25.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.25.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.25.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.25.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.25.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.25.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.25.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.25.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.25.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.26.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.26.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.26.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.26.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.26.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.26.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00011-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.26.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.26.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.26.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.27.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.27.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.27.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.27.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.27.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.27.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.27.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.27.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.27.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.28.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.28.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.28.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.28.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.28.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.28.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.28.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.28.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.28.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.29.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.29.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.29.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.29.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.29.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00012-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.29.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.29.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.29.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.29.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.30.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.30.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.30.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.30.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.30.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.30.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.30.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.30.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.30.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.31.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.31.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.31.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.31.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.31.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.31.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.31.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.31.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.31.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.32.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.32.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.32.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.32.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00013-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.32.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.32.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.32.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.32.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.32.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.33.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.33.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.33.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.33.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.33.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.33.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.33.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.33.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.33.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.34.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.34.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.34.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.34.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.34.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.34.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.34.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.34.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.34.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.35.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.35.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.35.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00014-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.35.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.35.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.35.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.35.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.35.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.35.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.36.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.36.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.36.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.36.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.36.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.36.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.36.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.36.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.36.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.37.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.37.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.37.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.37.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.37.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.37.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.37.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.37.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.37.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00015-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.38.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.38.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.38.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.38.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.38.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.38.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.38.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.38.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.38.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.39.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.39.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.39.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.39.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.39.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.39.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.39.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.39.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.39.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.40.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.40.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.40.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.40.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.40.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.40.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00016-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.40.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.40.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.40.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.41.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.41.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.41.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.41.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.41.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.41.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.41.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.41.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.41.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.42.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.42.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.42.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.42.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.42.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.42.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.42.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.42.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.42.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.43.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.43.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.43.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.43.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.43.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00017-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.43.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.43.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.43.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.43.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.44.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.44.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.44.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.44.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.44.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.44.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.44.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.44.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.44.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.45.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.45.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.45.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.45.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.45.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.45.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.45.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.45.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.45.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.46.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.46.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.46.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.46.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00018-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.46.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.46.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.46.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.46.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.46.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.47.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.47.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.47.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.47.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.47.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.47.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.47.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.47.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.47.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.48.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.48.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.48.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.48.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.48.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.48.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.48.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.48.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.48.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.49.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.49.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.49.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00019-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.49.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.49.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.49.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.49.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.49.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.49.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.50.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.50.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.50.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.50.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.50.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.50.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.50.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.50.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.50.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.51.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.51.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.51.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.51.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.51.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.51.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.51.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.51.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.51.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00020-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.52.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.52.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.52.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.52.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.52.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.52.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.52.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.52.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.52.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.53.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.53.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.53.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.53.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.53.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.53.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.53.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.53.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.53.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.54.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.54.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.54.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.54.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.54.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.54.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00021-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.54.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.54.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.54.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.55.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.55.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.55.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.55.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.55.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.55.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.55.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.55.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.55.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.56.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.56.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.56.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.56.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.56.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.56.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.56.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.56.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.56.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.57.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.57.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.57.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.57.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.57.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00022-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.57.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.57.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.57.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.57.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.58.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.58.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.58.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.58.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.58.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.58.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.58.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.58.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.58.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.59.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.59.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.59.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.59.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.59.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.59.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.59.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.59.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.59.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.60.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.60.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.60.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.60.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00023-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.60.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.60.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.60.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.60.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.60.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.61.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.61.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.61.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.61.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.61.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.61.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.61.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.61.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.61.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.62.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.62.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.62.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.62.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.62.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.62.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.62.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.62.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.62.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.63.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.63.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.63.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00024-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.63.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.63.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.63.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.63.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.63.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.63.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.64.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.64.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.64.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.64.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.64.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.64.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.64.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.64.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.64.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.65.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.65.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.65.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.65.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.65.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.65.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.65.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.65.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.65.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00025-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.66.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.66.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.66.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.66.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.66.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.66.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.66.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.66.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.66.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.67.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.67.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.67.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.67.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.67.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.67.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.67.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.67.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.67.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.68.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.68.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.68.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.68.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.68.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.68.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00026-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.68.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.68.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.68.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.69.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.69.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.69.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.69.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.69.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.69.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.69.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.69.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.69.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.70.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.70.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.70.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.70.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.70.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.70.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.70.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.70.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.70.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.71.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.71.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.71.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.71.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.71.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00027-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.71.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.71.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.71.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.71.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.72.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.72.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.72.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.72.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.72.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.72.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.72.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.72.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.72.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.73.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.73.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.73.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.73.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.73.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.73.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.73.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.73.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.73.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.74.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.74.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.74.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.74.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00028-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.74.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.74.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.74.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.74.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.74.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.75.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.75.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.75.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.75.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.75.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.75.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.75.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.75.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.75.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.76.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.76.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.76.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.76.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.76.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.76.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.76.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.76.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.76.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.77.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.77.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.77.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00029-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:blk.77.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.77.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.77.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.77.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.77.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.77.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.78.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.78.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.78.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.78.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.78.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.78.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.78.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.78.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.78.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.79.attn_norm.weight,     torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.79.ffn_down.weight,      torch.bfloat16 --> float16, shape = {28672, 8192}\n",
      "INFO:hf-to-gguf:blk.79.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.79.ffn_up.weight,        torch.bfloat16 --> float16, shape = {8192, 28672}\n",
      "INFO:hf-to-gguf:blk.79.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:blk.79.attn_k.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:blk.79.attn_output.weight,   torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.79.attn_q.weight,        torch.bfloat16 --> float16, shape = {8192, 8192}\n",
      "INFO:hf-to-gguf:blk.79.attn_v.weight,        torch.bfloat16 --> float16, shape = {8192, 1024}\n",
      "INFO:hf-to-gguf:output_norm.weight,          torch.bfloat16 --> float32, shape = {8192}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00030-of-00030.safetensors'\n",
      "INFO:hf-to-gguf:output.weight,               torch.bfloat16 --> float16, shape = {8192, 128256}\n",
      "Writing: 100%|█████████████████████████████| 141G/141G [08:45<00:00, 269Mbyte/s]\n",
      "INFO:hf-to-gguf:Model successfully exported to 'models/70B-v3/ggml-model-f16.gguf'\n",
      "/bin/bash: line 1: ./quantize: No such file or directory\n"
     ]
    }
   ],
   "source": [
    "# Run the code if you would like to convert and quantize models by yourself\n",
    "# Install Python dependencies\n",
    "!python3 -m pip install -r requirements.txt\n",
    "\n",
    "# # Convert the HF models to ggml FP16 format (High RAM requirement!)\n",
    "# !python3 convert-hf-to-gguf.py models/Llama-3-8B-GGUF/Meta-Llama-3-8B/ --outfile models/8B-v3/ggml-model-f16.gguf --outtype f16\n",
    "# !python3 convert-hf-to-gguf.py models/Llama-3-70B-GGUF/Meta-Llama-3-70B/ --outfile models/70B-v3/ggml-model-f16.gguf --outtype f16\n",
    "\n",
    "# # Quantize the model to 4-bits (using Q4_K_M method)\n",
    "# !./quantize ./models/8B-v3/ggml-model-f16.gguf ./models/8B-v3/ggml-model-Q4_K_M.gguf Q4_K_M\n",
    "# !./quantize ./models/70B-v3/ggml-model-f16.gguf ./models/70B-v3/ggml-model-Q4_K_M.gguf Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "30466fa3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Log start\n",
      "main: build = 2824 (4426e298)\n",
      "main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu\n",
      "main: seed  = 0\n",
      "llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ./models/8B-v3/ggml-model-Q4_K_M.gguf (version GGUF V3 (latest))\n",
      "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
      "llama_model_loader: - kv   0:                       general.architecture str              = llama\n",
      "llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B\n",
      "llama_model_loader: - kv   2:                          llama.block_count u32              = 32\n",
      "llama_model_loader: - kv   3:                       llama.context_length u32              = 8192\n",
      "llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096\n",
      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336\n",
      "llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32\n",
      "llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8\n",
      "llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000\n",
      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010\n",
      "llama_model_loader: - kv  10:                          general.file_type u32              = 15\n",
      "llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256\n",
      "llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128\n",
      "llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2\n",
      "llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe\n",
      "llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
      "llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
      "llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = [\"Ġ Ġ\", \"Ġ ĠĠĠ\", \"ĠĠ ĠĠ\", \"...\n",
      "llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000\n",
      "llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128001\n",
      "llama_model_loader: - kv  20:               general.quantization_version u32              = 2\n",
      "llama_model_loader: - type  f32:   65 tensors\n",
      "llama_model_loader: - type q4_K:  193 tensors\n",
      "llama_model_loader: - type q6_K:   33 tensors\n",
      "llm_load_vocab: special tokens definition check successful ( 256/128256 ).\n",
      "llm_load_print_meta: format           = GGUF V3 (latest)\n",
      "llm_load_print_meta: arch             = llama\n",
      "llm_load_print_meta: vocab type       = BPE\n",
      "llm_load_print_meta: n_vocab          = 128256\n",
      "llm_load_print_meta: n_merges         = 280147\n",
      "llm_load_print_meta: n_ctx_train      = 8192\n",
      "llm_load_print_meta: n_embd           = 4096\n",
      "llm_load_print_meta: n_head           = 32\n",
      "llm_load_print_meta: n_head_kv        = 8\n",
      "llm_load_print_meta: n_layer          = 32\n",
      "llm_load_print_meta: n_rot            = 128\n",
      "llm_load_print_meta: n_embd_head_k    = 128\n",
      "llm_load_print_meta: n_embd_head_v    = 128\n",
      "llm_load_print_meta: n_gqa            = 4\n",
      "llm_load_print_meta: n_embd_k_gqa     = 1024\n",
      "llm_load_print_meta: n_embd_v_gqa     = 1024\n",
      "llm_load_print_meta: f_norm_eps       = 0.0e+00\n",
      "llm_load_print_meta: f_norm_rms_eps   = 1.0e-05\n",
      "llm_load_print_meta: f_clamp_kqv      = 0.0e+00\n",
      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
      "llm_load_print_meta: f_logit_scale    = 0.0e+00\n",
      "llm_load_print_meta: n_ff             = 14336\n",
      "llm_load_print_meta: n_expert         = 0\n",
      "llm_load_print_meta: n_expert_used    = 0\n",
      "llm_load_print_meta: causal attn      = 1\n",
      "llm_load_print_meta: pooling type     = 0\n",
      "llm_load_print_meta: rope type        = 0\n",
      "llm_load_print_meta: rope scaling     = linear\n",
      "llm_load_print_meta: freq_base_train  = 500000.0\n",
      "llm_load_print_meta: freq_scale_train = 1\n",
      "llm_load_print_meta: n_yarn_orig_ctx  = 8192\n",
      "llm_load_print_meta: rope_finetuned   = unknown\n",
      "llm_load_print_meta: ssm_d_conv       = 0\n",
      "llm_load_print_meta: ssm_d_inner      = 0\n",
      "llm_load_print_meta: ssm_d_state      = 0\n",
      "llm_load_print_meta: ssm_dt_rank      = 0\n",
      "llm_load_print_meta: model type       = 8B\n",
      "llm_load_print_meta: model ftype      = Q4_K - Medium\n",
      "llm_load_print_meta: model params     = 8.03 B\n",
      "llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW) \n",
      "llm_load_print_meta: general.name     = Meta-Llama-3-8B\n",
      "llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'\n",
      "llm_load_print_meta: EOS token        = 128001 '<|end_of_text|>'\n",
      "llm_load_print_meta: LF token         = 128 'Ä'\n",
      "llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'\n",
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 2 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "  Device 1: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "llm_load_tensors: ggml ctx size =    0.44 MiB\n",
      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
      "llm_load_tensors: offloading non-repeating layers to GPU\n",
      "llm_load_tensors: offloaded 33/33 layers to GPU\n",
      "llm_load_tensors:        CPU buffer size =   281.81 MiB\n",
      "llm_load_tensors:      CUDA0 buffer size =  2113.28 MiB\n",
      "llm_load_tensors:      CUDA1 buffer size =  2290.21 MiB\n",
      "........................................................................................\n",
      "llama_new_context_with_model: n_ctx      = 8192\n",
      "llama_new_context_with_model: n_batch    = 2048\n",
      "llama_new_context_with_model: n_ubatch   = 512\n",
      "llama_new_context_with_model: flash_attn = 0\n",
      "llama_new_context_with_model: freq_base  = 500000.0\n",
      "llama_new_context_with_model: freq_scale = 1\n",
      "llama_kv_cache_init:      CUDA0 KV buffer size =   544.00 MiB\n",
      "llama_kv_cache_init:      CUDA1 KV buffer size =   480.00 MiB\n",
      "llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB\n",
      "llama_new_context_with_model:  CUDA_Host  output buffer size =     0.49 MiB\n",
      "llama_new_context_with_model: pipeline parallelism enabled (n_copies=4)\n",
      "llama_new_context_with_model:      CUDA0 compute buffer size =   640.01 MiB\n",
      "llama_new_context_with_model:      CUDA1 compute buffer size =   640.02 MiB\n",
      "llama_new_context_with_model:  CUDA_Host compute buffer size =    72.02 MiB\n",
      "llama_new_context_with_model: graph nodes  = 1030\n",
      "llama_new_context_with_model: graph splits = 3\n",
      "\n",
      "system_info: n_threads = 80 / 80 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \n",
      "sampling: \n",
      "\trepeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000\n",
      "\ttop_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 1.100\n",
      "\tmirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000\n",
      "sampling order: \n",
      "CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature \n",
      "generate: n_ctx = 8192, n_batch = 2048, n_predict = 1024, n_keep = 0\n",
      "\n",
      "\n",
      "\u001b[33m<|begin_of_text|> First Citizen:\n",
      "\n",
      " Before we proceed any further, hear me speak.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Speak, speak.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " You are all resolved rather to die than to famish?\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Resolved. resolved.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " First, you know Caius Marcius is chief enemy to the people.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " We know't, we know't.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " Let us kill him, and we'll have corn at our own price. Is't a verdict?\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " No more talking on't; let it be done: away, away!\n",
      "\n",
      " \n",
      "\n",
      " Second Citizen:\n",
      "\n",
      " One word, good citizens.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity,  while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of  our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes,  ere we become rakes: for the gods know I speak this in hunger for bread, not in thirst for revenge.\n",
      "\n",
      " \n",
      "\n",
      " \u001b[0m2d Cit:\n",
      "\n",
      " If there were no more but the empty titles madam, bare saucy names of no meaning,  if not 'splain'd,  the swaying counterfeit of metal or the canker'd hue of mint; would I might never come where Grace and virtue should like heaven to ascend! No, let the gentles in our city barks,  give priests news liberty: but hear me speak.\n",
      "\n",
      " \n",
      "\n",
      " Marc:\n",
      "\n",
      " Mark him. Hearken, villains--I will  deal with you!\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Let's beat him down again: the late tragedy of Coriolanus doth show what mischief  may be done by power. If he can brook this let him absent himself and crank up higher; in his own conceit a pearl in woe, a treasure but of purpose lost, then let him not make a venture of  ten groats in our lives: these feats he'll fill where such as bear the shift of  panthers and go dare tigers; shake patience on a  carcase like a cat. He'll make a hero out of Hercules: short and satisfying to one's knowledge, shall be drunk with honor. Of this state arises  the wonder that 's help'd the wonder.\n",
      "\n",
      " \n",
      "\n",
      " Marc:\n",
      "\n",
      " Why all the compass of my blood, lying on the earth, But in my eyes there does some kindle of vexation; Shall fire quit of heaviness?--young lads, forget you not your parts, and let us at our suit down with Mark Antony. (Exeunt.)\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "<|end_of_text|> [end of text]\n",
      "\n",
      "llama_print_timings:        load time =    2165.32 ms\n",
      "llama_print_timings:      sample time =     700.88 ms /   321 runs   (    2.18 ms per token,   458.00 tokens per second)\n",
      "llama_print_timings: prompt eval time =      68.97 ms /   258 tokens (    0.27 ms per token,  3740.76 tokens per second)\n",
      "llama_print_timings:        eval time =    2682.69 ms /   320 runs   (    8.38 ms per token,   119.28 tokens per second)\n",
      "llama_print_timings:       total time =    3809.43 ms /   578 tokens\n",
      "Log end\n"
     ]
    }
   ],
   "source": [
    "# Start inference on a gguf model (-h to show all options)\n",
    "!./main -ngl 10000 -m ./models/8B-v3/ggml-model-Q4_K_M.gguf --color --temp 1.1 --repeat_penalty 1.1 -c 0 -n 1024 -e -s 0 -p \"\"\"\\\n",
    "First Citizen:\\n\\n\\\n",
    "Before we proceed any further, hear me speak.\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "Speak, speak.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "You are all resolved rather to die than to famish?\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "Resolved. resolved.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "First, you know Caius Marcius is chief enemy to the people.\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "We know't, we know't.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "Let us kill him, and we'll have corn at our own price. Is't a verdict?\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "No more talking on't; let it be done: away, away!\\n\\n\\\n",
    "\\n\\n\\\n",
    "Second Citizen:\\n\\n\\\n",
    "One word, good citizens.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity, \\\n",
    "while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of \\\n",
    "our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes, \\\n",
    "ere we become rakes: for the gods know I speak this in hunger for bread, not in thirst for revenge.\\n\\n\\\n",
    "\\n\\n\\\n",
    "\"\"\"\n",
    "\n",
    "# # Chat template for Terminal (Use the instruction-tuned models to better follow the template)\n",
    "# !./main -ngl 10000 -m ./models/8B-v3-instruct/ggml-model-Q4_K_M.gguf --color -c 0 -n -2 -e -s 0 --mirostat 2 -i --no-display-prompt --keep -1 \\\n",
    "# -r '<|eot_id|>' -p '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nHi!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' \\\n",
    "# --in-prefix '<|start_header_id|>user<|end_header_id|>\\n\\n' --in-suffix '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17c21608",
   "metadata": {},
   "source": [
    "## Benchmarks"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b6aee37",
   "metadata": {},
   "source": [
    "### 8B Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "996ee79e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 2 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "  Device 1: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "| model                          |       size |     params | backend    | ngl | test       |              t/s |\n",
      "| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 512     |  7003.51 ± 18.04 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 1024    |   8545.00 ± 6.17 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 4096    |   8422.04 ± 7.60 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 8192    |   6895.68 ± 0.91 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 512     |    124.65 ± 0.08 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 1024    |    122.56 ± 0.10 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 4096    |    114.32 ± 0.12 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 8192    |    106.18 ± 0.04 |\n",
      "\n",
      "build: 4426e298 (2824)\n"
     ]
    }
   ],
   "source": [
    "!./llama-bench -p 512,1024,4096,8192 -n 512,1024,4096,8192 -m ./models/8B-v3/ggml-model-Q4_K_M.gguf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f07d6cd",
   "metadata": {},
   "source": [
    "### 8B F16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cb0b80b3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 2 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "  Device 1: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "| model                          |       size |     params | backend    | ngl | test       |              t/s |\n",
      "| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | pp 512     |  9177.92 ± 38.44 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | pp 1024    | 11094.51 ± 33.67 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | pp 4096    |  10329.29 ± 6.40 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | pp 8192    |   8067.29 ± 1.17 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | tg 512     |     53.64 ± 0.01 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | tg 1024    |     53.27 ± 0.01 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | tg 4096    |     51.64 ± 0.01 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 | tg 8192    |     49.83 ± 0.15 |\n",
      "\n",
      "build: 4426e298 (2824)\n"
     ]
    }
   ],
   "source": [
    "!./llama-bench -p 512,1024,4096,8192 -n 512,1024,4096,8192 -m ./models/8B-v3/ggml-model-f16.gguf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72d34884",
   "metadata": {},
   "source": [
    "### 70B Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8a4858d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 2 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "  Device 1: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes\n",
      "| model                          |       size |     params | backend    | ngl | test       |              t/s |\n",
      "| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | pp 512     |    839.43 ± 0.36 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | pp 1024    |    905.38 ± 0.42 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | pp 4096    |    846.38 ± 1.05 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | pp 8192    |    723.24 ± 0.52 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | tg 512     |     19.22 ± 0.01 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | tg 1024    |     19.06 ± 0.00 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | tg 4096    |     18.54 ± 0.00 |\n",
      "| llama 70B Q4_K - Medium        |  39.59 GiB |    70.55 B | CUDA       |  99 | tg 8192    |     17.92 ± 0.00 |\n",
      "\n",
      "build: 4426e298 (2824)\n"
     ]
    }
   ],
   "source": [
    "!./llama-bench -p 512,1024,4096,8192 -n 512,1024,4096,8192 -m ./models/70B-v3/ggml-model-Q4_K_M.gguf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
